import scrapy


class BaiduSpider(scrapy.Spider):
    """Demo spider that requests the Baidu home page and prints selector results.

    The spider does not yield items or follow-up requests; ``parse`` only
    prints the response and a few XPath selections so the difference between
    a ``SelectorList``, ``extract()`` and ``extract_first()`` can be seen.
    """

    # Name of the spider; this is the value used when running the spider.
    name = "baidu"
    # Only these domains may be visited.
    allowed_domains = ["www.baidu.com"]
    # Start URL: the first address to be requested.
    # Original author's note: start_urls is the allowed_domains entry with
    # "http://" prepended and "/" appended; the author removed that trailing
    # "/" because it caused an error for them.
    start_urls = ["http://www.baidu.com"]

    def parse(self, response):
        """Print the response for a start URL and the top-left link selections.

        Called after a URL from ``start_urls`` has been fetched.

        Args:
            response: The response object for the fetched page (comparable to
                what ``urllib.request.urlopen()`` / ``requests.get()`` return).
        """
        # Single definition of the XPath so the element query and the text
        # query cannot drift apart.
        links_xpath = '//div[@id="s-top-left"]/a'

        print("*" * 100)
        print(response.text)  # page HTML as a str
        print(response.body)  # page HTML as raw bytes

        links = response.xpath(links_xpath)  # list of Selector objects
        link_texts = response.xpath(links_xpath + "/text()")

        # The methods are extract()/extract_first(); the previous spelling
        # "exact"/"exact_first" does not exist and raised AttributeError.
        print(link_texts.extract())  # every matched text node: list of str
        print(link_texts.extract_first())  # first matched text node: str
        print(links)
        for link in links:
            print(link.extract())  # the matched <a> element serialized as str


"""
In [11]: 
print(response.xpath('//h3/a/@title'))
# scrapy.selector.unified.SelectorList: a list made up of Selector objects
Out[11]:
# Line breaks in the output below were added by hand for readability
[<Selector xpath='//h3/a/@title' data='A Light in the Attic'>,
<Selector xpath='//h3/a/@title' data='Tipping the Velvet'>, 
<Selector xpath='//h3/a/@title' data='Soumission'>, 
<Selector xpath='//h3/a/@title' data='Sharp Objects'>, 
<Selector xpath='//h3/a/@title' data='Sapiens: A Brief History of Humankind'>, 
<Selector xpath='//h3/a/@title' data='The Requiem Red'>,
<Selector xpath='//h3/a/@title' data='The Dirty Little Secrets of Getting Y...'>, 
<Selector xpath='//h3/a/@title' data='The Coming Woman: A Novel Based on th...'>,
<Selector xpath='//h3/a/@title' data='The Boys in the Boat: Nine Americans ...'>,]
"""

"""
In [9]: print(response.xpath('//h3/a/@title').extract())
# List
Out[9]:
# Line breaks in the output below were added by hand for readability
['A Light in the Attic', 
 'Tipping the Velvet', 
 'Soumission', 
 'Sharp Objects', 
 'Sapiens: A Brief History of Humankind', 
 'The Requiem Red', 
 'The Dirty Little Secrets of Getting Your Dream Job', 
 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics', ]
"""

"""
In [7]: print(response.xpath('//h3/a/@title').extract_first())
# Str
Out[7]:
A Light in the Attic
"""
